# Dickstein, Ho, and Mark (2023)
# This code creates a cleaned up household (subscriber)-level dataset from the Oregon Health Authority’s APAC dataset
# for the years 2014-2016 that keeps in households used in the counterfactual exercises but not in the estimation procedure.

# * # * # * # * # * # * #
# PRELIMINARIES         #
# * # * # * # * # * # * #

setwd("../library")
source("PreliminariesCode.R")

# * # * # * # * # * # * #
# READ IN DERIVED DATA  #
# * # * # * # * # * # * #

## Loading Subscriber Data:
SubDat_List<- list()
for (i in 5:7){
  SubDat_List[[i]] <- fread(paste0("orig/SubData", Shortyear.Vec[i], "7.csv"))
  print(paste0("Before the procedure, there are ", nrow(SubDat_List[[i]]), "observations"))
}
SubDat <- do.call("rbind", SubDat_List[5:7])
is.identified(SubDat, c("subscriberid", "year"))

# Loading Constructed Plan Data
constructed_plan_derived <- fread("orig/ConstructedPlanDat.csv")
constructed_plan_derived <- constructed_plan_derived[year %in% 2014:2016]
is.identified(constructed_plan_derived, c("constrplanid", "year"))

# Loading All Choice Premiums of the Pre-Merge market
all_choice_premiums <- fread("AllChoicePremiums.csv")
all_choice_premiums <- all_choice_premiums[year %in% 2014:2016]
is.identified(all_choice_premiums, c("subscriberid", "constructed_plan_year"))

# * # * # * # * # * # * #
# FORMAT DERIVED DATA   #
# * # * # * # * # * # * #

# Adjusting Variables in all_choice_premiums Data
all_choice_premiums[, mnc_plantype := MNC, ]
all_choice_premiums[, constructed_plan_year := paste0(
  substr(Market.Type, 0, 2),
  substr(Exchange, 0, 2),
  ratingarea,
  substr(PAYER_ID, 4, 5),
  metal,
  mnc_plantype,
  "_",
  year), ]

# Adjusting Variables in SubDat Data
SubDat[, mnc_plantype := if_else(best_guess_plantype %in% c("EPO", "HMO"), "MNC", "NOT"), ]
SubDat[, constructed_plan_year := paste0(
  substr(markettype, 0, 2),
  substr(exchange, 0, 2),
  best_guess_ra,
  substr(payer_id, 4, 5),
  best_guess_metal,
  mnc_plantype,
  "_",
  year
), ]

# Merging with all_choice Premiums to get variables of each household's chosen plan
SubDat <- merge(
  SubDat,
  all_choice_premiums[, .(
    subscriberid, year, constructed_plan_year, grossprem_old = grossprem, best_guess_Subsidy_old = best_guess_Subsidy, 
    best_guess_AV_old = best_guess_AV, best_guess_netprem_old = best_guess_netprem), ],
  by = c("subscriberid", "constructed_plan_year", "year"),
  all.x = T)

print("Formatting Complete")

# * # * # * # * # * # * #
# SUBSETTING            #
# * # * # * # * # * # * #

n_stage <- nrow(SubDat)
names(n_stage) <- "Full"

SubDat <-  SubDat[ (!is.na(age)) ]
n_stage <- c(n_stage, nrow(SubDat))
names(n_stage)[which(names(n_stage) == "")] <- "Non-Missing Age"

SubDat <-  SubDat[ age >= 20 & age <= 65]
n_stage <- c(n_stage, nrow(SubDat))
names(n_stage)[which(names(n_stage) == "")] <- "Age in 20-65 Range"

SubDat <- SubDat[ (best_guess_market %in% c(1:2, 4)) ]
n_stage <- c(n_stage, nrow(SubDat))
names(n_stage)[which(names(n_stage) == "")] <- "Markets 1,2, 4"

SubDat <- SubDat[ (!is.na(nspouse)) ]
n_stage <- c(n_stage, nrow(SubDat))
names(n_stage)[which(names(n_stage) == "")] <- "Non-Missing nspouse"

SubDat <- SubDat[ (!is.na(ndeps)) ]
n_stage <- c(n_stage, nrow(SubDat))
names(n_stage)[which(names(n_stage) == "")] <- "Non-Missing ndeps"

SubDat <- SubDat[ (!is.na(sum_concurrent_risk)) ]
n_stage <- c(n_stage, nrow(SubDat))
names(n_stage)[which(names(n_stage) == "")] <- "Non-Missing ACG"

SubDat <-  SubDat[ (!is.na(best_guess_incomeoverFPL)) ]
n_stage <- c(n_stage, nrow(SubDat))
names(n_stage)[which(names(n_stage) == "")] <- "Non-Missing Income"

print("Subsetting Complete")

# * # * # * # * # * # * #
# CREATE NEW VARIABLES  #
# * # * # * # * # * # * #
income_cutbreaks <- c(0, 2.5, 8)
age_cutbreaks <- c(0, 17, 24, 34, 44, 54, 65, 100)

SubDat[, married := (nspouse > 0), ]
SubDat[, withkids := (ndeps > 0), ]
SubDat[, exog_exchange := substr(exchange, 0, 2), ]
SubDat[, married_kids := if_else(
  married,
  if_else(withkids, 2, 1),
  if_else(withkids, 3, 0)), ]
SubDat[, urban := best_guess_ra %in% c(1, 2, 3, 7), ]
SubDat[, portland := best_guess_ra %in% c(1), ]
SubDat[, fpl_bins := cut(
  as.numeric(best_guess_incomeoverFPL),
  breaks = income_cutbreaks, label = FALSE), ]
SubDat[, fpl_bins_label := cut(
  as.numeric(best_guess_incomeoverFPL),
  breaks = income_cutbreaks), ]
SubDat[, age_bins := cut(age, breaks = age_cutbreaks, label = FALSE), ]
SubDat[, age_bins_label := cut(age, breaks = age_cutbreaks), ]
SubDat[, acg_quartiles := quantcut(sum_concurrent_risk, 4, labels=FALSE), ]
SubDat[, acg_quartiles_label := quantcut(sum_concurrent_risk, 4), ]
SubDat[, acg_max_quartiles := quantcut(max_concurrent_risk, 4, labels=FALSE), ]
SubDat[, acg_max_quartiles_label := quantcut(max_concurrent_risk, 4), ]

# Creating the two number of months variables:
SubDat[, nummonths_observed := nummonthsmode, ]
SubDat[, nummonths_span := lastmonth - firstmonth + 1, ]

# Creating Monthly cost data
SubDat[, totpaidmonth_observed := totpaid / nummonths_observed,]
SubDat[, totcopaymonth_observed := totcopay / nummonths_observed,]
SubDat[, totcoinsmonth_observed := totcoins / nummonths_observed,]
SubDat[, totdeductmonth_observed := totdeduct / nummonths_observed,]
SubDat[, totpaidmonth_span := totpaid / nummonths_span,]
SubDat[, totcopaymonth_span := totcopay / nummonths_span,]
SubDat[, totcoinsmonth_span := totcoins / nummonths_span,]
SubDat[, totdeductmonth_span := totdeduct / nummonths_span,]

print("New Variable Creation Complete")

# * # * # * # * # * # * # * # * # * #
# SAVING DATASET #
# * # * # * # * # * # * # * # * # * #

write.csv(SubDat, file = "counterfactual_all_subscribers.csv", row.names = F)
